## File Description:
# Purpose: Appendix for CER Electricity Trial Analysis
# Author: Brian Prest
# This file does the following:
# 1) Sample Representativeness
# 2) Sensitivity: Randomly Assigned Awareness in Control Group
# 3) Real Time Price Simulations
# 4) Response by Tariff Group
# 5) Honest Tree
# 6) Differential responses by temperature and appliances
# 7) Time-varying responses
# 8) Varying response by income/tariff groups
# 9) Effect on Bills/CS
# 10) Checking side statistics
library(data.table)
library(zoo)
library(lubridate)
library(lfe)
library(dummies)
library(causalTree)
library(xlsx)
library(stargazer)
library(xtable)
library(tree)
library(randomForest)
library(Hmisc)

# Concatenating function
"%&%" <- function(x,y) paste0(x,y)

# Set directories
working_dir = '/your/working/directory/Replication Code/'
data_dir = working_dir%&%'/Data/'
output_dir = working_dir%&%'/Output/'
graph_dir = output_dir%&%'/Graphs/'

# Load data from main analysis
load(output_dir%&%'working_electricity_data.Rdata')

#### For Appendix
#### 1) Representativeness -----
names(dt.reg)
sum.stats.vars = c('f_internet','f_broadband','n_adults','n_kids','d_has_kids','n_year_built',
                   'fs_approx_home_age.[0,5) years','fs_approx_home_age.[10,30) years',
                   'fs_approx_home_age.[5,10) years','fs_approx_home_age.[30,75) years',
                   'fs_approx_home_age.75plus years', 'n_home_size',
                   'n_bedrooms','d_heat_elec_central','d_heat_elec_plugin','d_heat_gas',
                   'd_heat_oil','d_heat_solid_fuel','d_heat_renew','d_heat_other','d_immersion', 'n_immersion',
                   'd_waterheater_central','d_waterheater_elec_immersion','d_waterheater_elec_instant',
                   'd_waterheater_gas','d_waterheater_oil','d_waterheater_solid','d_waterheater_renew',
                   'd_waterheater_other','d_waterheater_timer','d_washing_machine','n_washing_machine',
                   'd_tumble_dryer', 'n_tumble_dryer','d_dishwasher','n_dishwasher',
                   'd_elec_shower_instant','d_elec_shower_pumped','d_elec_cook', 'd_elec_heater_appliance_plugin',
                   'd_standalone_freezer','n_standalone_freezer','d_water_pump',
                   'n_tv_small','n_tv_large','n_desktop_computer','n_laptop_compter','n_game_console',
                   'd_hot_water_lagging_jacket','d_walls_insulated','d_participate_reduce_usage',
                   'n_expect_major_changes', 'fs_age.18-25','fs_age.26-35',
                   'fs_age.36-45','fs_age.46-55','fs_age.56-65','fs_age.65+','fs_age.NA',
                   'fs_social_class.AB','fs_social_class.C1','fs_social_class.C2','fs_social_class.DE',
                   'fs_social_class.Farmer','fs_social_class.NA',
                   'fs_education.None','fs_education.Primary','fs_education.Secondary to Cert',
                   'fs_education.Secondary without Cert','fs_education.Third','fs_education.NA',
                   'fs_own_or_rent.Own outright','fs_own_or_rent.Own with mortgage',
                   'fs_cook_type.ElectricCook','fs_cook_type.GasCook',
                   'fs_cook_type.OilCook','fs_cook_type.SolidCook',
                   'fs_home_style.Apartment','fs_home_style.Bungalow',
                   'fs_home_style.Detached','fs_home_style.Semi-detached',
                   'fs_home_style.Terraced','fs_home_style.NA')
sum.stats.vars
round(sapply(dt.reg[, sum.stats.vars, with=F], mean, na.rm=TRUE),2)
library(pastecs)
sum.stats = t(stat.desc(dt.reg[, sum.stats.vars, with=FALSE])[c('mean','std.dev','min','max'),])

# From "Appendices to the Electricity Smart Metering CBT Findings Report"
# https://www.cru.ie/wp-content/uploads/2011/07/cer11080aii.pdf
# Section A3.3, non-response survey data.
grep('internet', sum.stats.vars, v=T)
nresp = c( # Demographics
  'fs_age.18-25'=0.01,
  'fs_age.26-35'=0.12,
  'fs_age.36-45'=0.22,
  'fs_age.46-55'=0.19,
  'fs_age.56-65'=0.20,
  'fs_age.65+'=0.25,
  'n_adults'=1.98,
  'n_kids'=0.71,
  'd_has_kids'=0.36,
  'fs_social_class.AB'=0.13,
  'fs_social_class.C1'=0.26,
  'fs_social_class.C2'=0.17,
  'fs_social_class.DE'=0.40,
  'fs_social_class.Farmer'=0.04,
  'fs_education.None'=0.01,
  'fs_education.Primary'=0.16,
  'fs_education.Secondary'=0.49,
  'fs_education.Third'=0.29,
  'fs_education.NA'=0.04,
  # Home chars
  'fs_own'=0.88,
  'fs_approx_home_age.[0,5) years'=0.11,
  'fs_approx_home_age.[5,10) years'=0.12,
  'fs_approx_home_age.[10,30) years'=0.26,
  'fs_approx_home_age.[30,75) years'=0.38,
  'fs_approx_home_age.75plus years'=0.13,
  'fs_home_style.Apartment'=0.03,
  'fs_home_style.Bungalow'=0.21,
  'fs_home_style.Detached'=0.28,
  'fs_home_style.Semi-detached'=0.31,
  'fs_home_style.Terraced'=0.17,
  'n_bedrooms'=3.25,
  # Appliances
  'd_heat_elec'=0.13,
  'd_heat_oil'=0.43,
  'd_heat_gas'=0.33,
  'd_heat_solid_fuel'=0.11,
  'd_waterheater_elec_immersion'=0.59,
  'd_washing_machine'=0.98,
  'd_tumble_dryer'=0.67,
  'd_dishwasher'=0.58,
  'd_standalone_freezer'=0.55,
  'd_elec_cook'=0.69,
  'f_internet'=0.72
)
nresp = matrix(nresp, dimnames=list(names(nresp),'mean'))
nresp = data.table(nresp, keep.rownames=TRUE)

x = copy(dt.reg)
x[, fs_education.Secondary:= ifelse(`fs_education.Secondary to Cert`+`fs_education.Secondary without Cert`>0, 1, 0)]
x[, fs_own := ifelse(`fs_own_or_rent.Own outright`+`fs_own_or_rent.Own with mortgage`>0, 1, 0)]
x[, d_heat_elec := ifelse(d_heat_elec_central + d_heat_elec_plugin>0, 1, 0)]
if (!any(grepl('Secondary$', sum.stats.vars))) sum.stats.vars = c(sum.stats.vars, 'fs_education.Secondary')
if (!any(grepl('fs_own$', sum.stats.vars))) sum.stats.vars = c(sum.stats.vars, 'fs_own')
if (!any(grepl('d_heat_elec$', sum.stats.vars))) sum.stats.vars = c(sum.stats.vars, 'd_heat_elec')
sum.stats.se = t(stat.desc(x[, sum.stats.vars, with=FALSE])[c('mean','SE.mean','CI.mean.0.95'),])
sum.stats.se = data.table(sum.stats.se, keep.rownames=TRUE)
setkey(sum.stats.se, rn)
sum.stats.se[,lwr:=mean-CI.mean.0.95]
sum.stats.se[,upr:=mean+CI.mean.0.95]

sum.stats.se = merge(sum.stats.se, nresp, by='rn')
sum.stats.se[, diff.xy := mean.x-mean.y]
sum.stats.se[, t.stat := diff.xy/SE.mean]
sum.stats.se[, p.val := round(2*(1-pnorm(abs(t.stat))),3)]
sum.stats.se = sum.stats.se[nresp[,rn], .(rn, mean.x, mean.y, diff.xy, p.val)]
sum.stats.se

sum.stats.se[, .N]; nrow(nresp)

sum.stats.se[, .(rn, diff.xy=round(diff.xy, 2))]

# Output Table A.1
xtable(sum.stats.se)

#### 2) Sensitivity: Re-run diff in diff with alternative randomly assigned awareness indicators -----
set.seed(1)
coef.pvals = vector("list", 1000) 
dt.rand.aware = copy(dt.reg)

dt.rand.aware[any.trt==1, d_aware_of_tariff_change_pred := d_aware_of_tariff_change]
for (i in 1:length(coef.pvals)) {
  dt.rand.aware[any.trt==0, d_aware_of_tariff_change_pred := rbinom(.N, 1, 0.5)] # coin flip
  
  coef.pvals[[i]] = summary(lm(d.kwh.Peak ~ any.trt*d_aware_of_tariff_change_pred, data=dt.rand.aware))$coefficients
}
rand.aware.coefs = sapply(coef.pvals, function(x) x['any.trt:d_aware_of_tariff_change_pred','Estimate'])
rand.aware.pvals = sapply(coef.pvals, function(x) x['any.trt:d_aware_of_tariff_change_pred','Pr(>|t|)'])
hist(rand.aware.coefs)
mean(rand.aware.coefs) # average coefficient (similar but slightly smaller than point est. Within CI)
mean(rand.aware.coefs<0) # how many are negative (100%)
mean(rand.aware.pvals<0.05) # how many are signif? (98%)
hist(rand.aware.pvals) # histogram of p values

# Plot distribution of awareness-treatment interaction coefficients (Figure A.5)
if (write.out==TRUE) pdf(file=graph_dir%&%'/Random_Imputation_Coefficient_Graph-'%&%today()%&%'.pdf', 
                         family='serif', width=10, height=7)
par(mai=c(0.85,0.85,0.5,0.15))
plot(density(100*rand.aware.coefs, bw=0.5), xlim=100*c(-0.14, 0), lwd=2, 
     main='', xlab='Impact on Treatment Effect (Percentage Points)',
     cex.lab=1.75, cex.main=1.75, cex.axis=1.75)
grid(col='gray70')
aware.point.est = summary(lm(d.kwh.Peak ~ any.trt*d_aware_of_tariff_change, data=dt.reg))$coefficients[4,]
abline(v=100*mean(rand.aware.coefs), lwd=2, lty=2)
abline(v=100*aware.point.est['Estimate'], col='blue', lwd=2, lty=4)
legend('topright', legend=c('Kernel Density','Mean of Distribution','Point Estimate'),
       col=c('black','black','blue'), lty=c(1,2,4), lwd=2, cex=1.5, bty='n')
dev.off()

#### 3) Real-time pricing simulation ----

# Multiple steps:
# i) Estimate demand curves
# ii) Find what real time prices would be
# iii) Scale demand by estimated demand curves according to real time prices

# i) Estimate flexible demand curves for each leaf ---
node.nums = rownames(c.tree.peak$frame)
leaf.locs = data.table(id=dt.reg.dupe[, id], leaf.num=node.nums[c.tree.peak$where]) # returns row names of tree$frame
leaf.locs = leaf.locs[!duplicated(leaf.locs)] # removes duplicated controls in leaves (this in effect removes the dupes for tariffs)
rm(node.nums)

leaves = c.tree.peak$frame[c.tree.peak$frame[,'var']=='<leaf>',]
leaf.nums = rownames(leaves)
period.list = c('Peak','Day','Night')

deg = 2
lm.rate.peak = list()
dt.leaf = merge(dt.reg, leaf.locs, allow.cartesian=TRUE, all.x=TRUE, all.y=TRUE)
for (i in 1:length(leaf.nums)) { # change this so that the leaves depend on t (i.e. no leaves for t=2 or 3)
  lf = leaf.nums[i]
  dt.temp = dt.leaf[leaf.num==lf]
  
  lm.rate.peak[[i]] = lm(d.kwh.Peak ~ -1 + as.factor(rate.Peak_trt.period), data=dt.temp)
  # summary(lm.rate.peak)
}
lm.rate.peak.all = lm(d.kwh.Peak ~ -1 + as.factor(rate.Peak_trt.period), data=dt.reg)
summary(lm.rate.peak.all)

lapply(lm.rate.peak, summary)

lm.rate.day = lm(d.kwh.Day ~ -1 + as.factor(rate.Day_trt.period), data=dt.reg)
lm.rate.night = lm(d.kwh.Night ~ -1 + as.factor(rate.Night_trt.period), data=dt.reg)
summary(lm.rate.day)
summary(lm.rate.night)

fit = copy(lm.rate.peak[[1]])
regex = 'period\\)'
grid = 7:15
rm(list=c('fit','regex','grid'))

get.pts = function(fit, regex) {
  coef.mat = summary(fit)$coefficients[grep(regex,names(fit$coefficients)),]
  coefs = coef.mat[,1]
  coef.se = coef.mat[,2]
  coef.se = sqrt(coef.mat[,2]^2+coef.mat[grep('14.1',names(coefs)),2]^2) # std err of difference from control
  coef.se[grep('14.1',names(coefs))] = 0 # above line doesn't account for covariance, which only affects control
  coefs = coefs - coefs[grep('14.1',names(coefs))]
  coefs.upr = coefs + 1.96*coef.se
  coefs.lwr = coefs - 1.96*coef.se
  pts = substr(names(coefs), start= regexpr(regex, names(coefs)) + nchar(regex)-1, stop=nchar(names(coefs)))
  pts = as.numeric(pts)  
  return(list(coefs, pts, coefs.upr, coefs.lwr, coef.se))
}
my.interp = function(fit, regex, grid, extrap=TRUE) {
  library(Hmisc)
  coefs.pts = get.pts(fit, regex) # coefs (y's) in 1st element. pts (x's, price points) in 2nd element.
  if (extrap) out = approxExtrap(coefs.pts[[2]], coefs.pts[[1]], xout=grid, rule=2)
  if (!extrap) out =      approx(coefs.pts[[2]], coefs.pts[[1]], xout=grid, rule=2)
  return(out)
}

# Construct demand curves for plot
price.reso = 0.5
price.grid = seq(7, 45, by=price.reso)
d.kwh.pct_hat = array(NA, dim=c(length(price.grid), length(leaf.nums), length(period.list)),
                      dimnames = list(price.grid, leaf.nums, period.list))
q_hat = copy(d.kwh.pct_hat)

d.kwh.pct_hat.peak.all = my.interp(lm.rate.peak.all, regex='period\\)', grid=price.grid, extrap=TRUE)$y
q_hat.peak.all = (1+d.kwh.pct_hat.peak.all)*dt.reg[, mean(kwh.Peak_base)]
for (i in 1:length(lm.rate.peak)){
  d.kwh.pct_hat[,i,1] = my.interp(lm.rate.peak[[i]], regex='period\\)', grid=price.grid, extrap=TRUE)$y
  d.kwh.pct_hat[,i,2] = my.interp(lm.rate.day, regex='period\\)', grid=price.grid, extrap=TRUE)$y
  d.kwh.pct_hat[,i,3] = my.interp(lm.rate.night, regex='period\\)', grid=price.grid, extrap=TRUE)$y
  
  q_hat[,i,1] = (1+d.kwh.pct_hat[,i,1])*dt.leaf[leaf.num==leaf.nums[i], mean(kwh.Peak_base)]
  q_hat[,i,2] = (1+d.kwh.pct_hat[,i,2])*dt.reg[, mean(kwh.Day_base)]
  q_hat[,i,3] = (1+d.kwh.pct_hat[,i,3])*dt.reg[, mean(kwh.Night_base)]
} 
q_hat[3, 1, 1]
q_hat['7.5', '8', 'Peak']

# plot.all.curves = TRUE
# Output Figure 6
plot.all.curves = FALSE
if (plot.all.curves) file.out = 'Demand-Curves-Heterogeneous-'%&%today()%&%'.pdf'
if (!plot.all.curves) file.out = 'Demand-Curves-Aggregated-'%&%today()%&%'.pdf'
if (write.out==TRUE) pdf(file=graph_dir%&%'/'%&%file.out, family='serif', width=10, height=7)
par(mfrow=c(1,1), family='serif', mai=c(0.85,0.85,0.5,0.15)) # for pdf output

excluded.leaves = which(leaves[,'wt']<100)
num.to.plot = length(leaf.nums)-length(excluded.leaves)
my.cols = rev(rainbow(num.to.plot, start=0.3, end=1))
if (plot.all.curves) {
  col.no = 1
  peak.interp = list()
  for (i in length(lm.rate.peak):1) {
    rates[period=='Peak', range(rate)]
    to.plot = price.grid>=14.1 & price.grid<=38
    if (i %in% excluded.leaves) next
    if (i==length(lm.rate.peak)) {
      plot(price.grid[to.plot] ~ q_hat[to.plot,i,1], type='l', lwd=2, ylim=c(0,45), xlim=c(0.003, 0.6), #xlim=c(0, 1), 
           col=my.cols[1],  cex.lab=1.75, cex.main=1.75, cex.axis=1.75, # main='Peak Periods, by Leaf', 
           ylab='Price (€ cents per kWh)', xlab='Consumption (kWh per 30 minutes)')
      grid(col='gray20')
    }
    if (i<length(lm.rate.peak)) lines(price.grid[to.plot] ~ q_hat[to.plot,i,1], type='l',lwd=2, col=my.cols[col.no])
    
    coefs.pts = get.pts(lm.rate.peak[[i]], regex='period\\)')
    coefs = (1+coefs.pts[[1]])*dt.leaf[leaf.num==leaf.nums[i], mean(kwh.Peak_base)]
    pts = coefs.pts[[2]]
    points(pts ~ coefs, col=my.cols[col.no], cex=1, pch=1)
    
    col.no = col.no + 1
  }
  
  legend(x=0, y=40, legend=c('IHD Group','Monthly Bill Group','OLR Group','Bi-monthly Bill Group','Unaware, Large User Group')%&%
           ', Peak Hours', col=rev(my.cols), lty=1, bty='n', pch=1)
  legend(x=0, y=27.5, legend=c('All Households, Day Hours', 'All Households, Night Hours'), lty=2:3, col='black', bty='n', pch=c(0,2))
}

std.err.of.mean = function(x) sd(x)/sqrt(length(x))
if (plot.all.curves==FALSE) {
  # Plot peak curve
  coefs.pts = get.pts(lm.rate.peak.all, regex='period\\)')
  coefs = (1+coefs.pts[[1]])*dt.reg[, mean(kwh.Peak_base)]
  coefs.upr = (1+coefs.pts[[3]])*(dt.reg[, mean(kwh.Peak_base)] + 0*1.96*dt.reg[, std.err.of.mean(kwh.Peak_base)])
  coefs.lwr = (1+coefs.pts[[4]])*(dt.reg[, mean(kwh.Peak_base)] - 0*1.96*dt.reg[, std.err.of.mean(kwh.Peak_base)])
  pts = coefs.pts[[2]]
  rates[period=='Peak', range(rate)]
  to.plot = price.grid>=14.1 & price.grid<=38
  plot(price.grid[to.plot] ~ q_hat.peak.all[to.plot], type='l', lwd=2, ylim=c(0,45), xlim=c(0.003, 0.6), #xlim=c(0, 1), 
       col='black',  cex.lab=1.75, cex.main=1.75, cex.axis=1.75, # main='Peak Periods, by Leaf', 
       ylab='Price (€ cents per kWh)', xlab='Consumption (kWh per 30 minutes)')
  points(pts ~ coefs, cex=1, pch=1)
  grid(col='gray20')
  
  coefs.upr.line = approxExtrap(pts, coefs.upr, xout=price.grid[to.plot])$y
  coefs.lwr.line = approxExtrap(pts, coefs.lwr, xout=price.grid[to.plot])$y
  lines(price.grid[to.plot] ~ coefs.upr.line, lty=4)
  lines(price.grid[to.plot] ~ coefs.lwr.line, lty=4)
  
  legend(x=0, y=40, legend=c('Peak Hours','Day Hours', 'Night Hours'), lty=1:3, col='black', bty='n', pch=c(1,0,2), lwd=2, cex=1.5)
}
d.kwh.pct_hat.peak.all = my.interp(lm.rate.peak.all, regex='period\\)', grid=price.grid, extrap=TRUE)$y
q_hat.peak.all

# Plot daytime curve
coefs.pts = get.pts(lm.rate.day, regex='period\\)')
coefs = (1+coefs.pts[[1]])*dt.reg[, mean(kwh.Day_base)]
pts = coefs.pts[[2]]
rates[period=='Day', range(rate)]
to.plot = price.grid>=12.5 & price.grid<=14.1
lines(price.grid[to.plot] ~ q_hat[to.plot,1,2],lwd=2, lty=2)
points(pts ~ coefs, cex=1, pch=0)

# Plot std errors
if (plot.all.curves==FALSE) {
  coefs.upr = (1+coefs.pts[[3]])*(dt.reg[, mean(kwh.Day_base)] + 0*1.96*dt.reg[, std.err.of.mean(kwh.Day_base)])
  coefs.lwr = (1+coefs.pts[[4]])*(dt.reg[, mean(kwh.Day_base)] - 0*1.96*dt.reg[, std.err.of.mean(kwh.Day_base)])
  coefs.upr.line = approxExtrap(pts, coefs.upr, xout=price.grid[to.plot])$y
  coefs.lwr.line = approxExtrap(pts, coefs.lwr, xout=price.grid[to.plot])$y
  lines(price.grid[to.plot] ~ coefs.upr.line, lty=4)
  lines(price.grid[to.plot] ~ coefs.lwr.line, lty=4)
}

# Plot nighttime curve
coefs.pts = get.pts(lm.rate.night, regex='period\\)')
coefs = (1+coefs.pts[[1]])*dt.reg[, mean(kwh.Night_base)]
pts = coefs.pts[[2]]
rates[period=='Night', range(rate)]
to.plot = price.grid>=9 & price.grid<=14.1
lines(price.grid[to.plot] ~ q_hat[to.plot,1,3], lwd=3, lty=3)
points(pts ~ coefs, cex=1, pch=2)

# Plot std errors
if (plot.all.curves==FALSE) {
  coefs.upr = (1+coefs.pts[[3]])*(dt.reg[, mean(kwh.Night_base)] + 0*1.96*dt.reg[, std.err.of.mean(kwh.Night_base)])
  coefs.lwr = (1+coefs.pts[[4]])*(dt.reg[, mean(kwh.Night_base)] - 0*1.96*dt.reg[, std.err.of.mean(kwh.Night_base)])
  coefs.upr.line = approxExtrap(pts, coefs.upr, xout=price.grid[to.plot])$y
  coefs.lwr.line = approxExtrap(pts, coefs.lwr, xout=price.grid[to.plot])$y
  lines(price.grid[to.plot] ~ coefs.upr.line, lty=4)
  lines(price.grid[to.plot] ~ coefs.lwr.line, lty=4)
}

dev.off()

# Construct demand curves for analysis
price.grid = seq(-30, 180, by=price.reso)

# price.grid = seq(0, 60, by=1) # for testing
# An extra period dimension for weekends
d.kwh.pct_hat = array(NA, dim=c(length(price.grid), length(leaf.nums), length(period.list)+1),
                      dimnames = list(price.grid, leaf.nums, c(period.list, 'Weekend')))
# q_hat = copy(d.kwh.pct_hat)

for (i in 1:length(lm.rate.peak)){
  d.kwh.pct_hat[,i,'Peak'] = my.interp(lm.rate.peak[[i]], regex='period\\)', grid=price.grid, extrap=TRUE)$y
  d.kwh.pct_hat[,i,'Day'] = my.interp(lm.rate.day, regex='period\\)', grid=price.grid, extrap=TRUE)$y
  d.kwh.pct_hat[,i,'Night'] = my.interp(lm.rate.night, regex='period\\)', grid=price.grid, extrap=TRUE)$y
  d.kwh.pct_hat[,i,'Weekend'] = 0 # model weekend demand as completely unresponsive
} 
# Truncate extrapolated % changes at +/- 50%
trunc = 0.5
dims = dim(d.kwh.pct_hat)
for (i in 1:dims[1]) for (j in 1:dims[2]) for (k in 1:dims[3]) {
  if (d.kwh.pct_hat[i,j,k] > trunc) d.kwh.pct_hat[i,j,k] = trunc
  else if (d.kwh.pct_hat[i,j,k] < -trunc) d.kwh.pct_hat[i,j,k] = -trunc
}

# Model daytime demand as completely unresponsive
d.kwh.pct_hat[,,'Day'] = 0

# Model small-group demand (with the wrong-sign demand slopes) as completely unresponsive
d.kwh.pct_hat[,excluded.leaves,'Peak'] = 0

d.kwh.pct_hat[1:30,,]
range(d.kwh.pct_hat)

## ii) Determine real-time prices ----
# Read in SEM load & price data 
# Note that load data is in MW per half hour (so /2 to get MWh)
sem = fread(data_dir%&%'/Ireland SEM/SEM_Load_and_Prices.csv')
names(sem) <- tolower(names(sem))
sem[, .N, by=nchar(delivery_date)]
# sem[nchar(delivery_date)==10,table(delivery_date)]
sem[nchar(delivery_date)<10, date:=as.Date(delivery_date, format='%d/%m/%y')]
sem[nchar(delivery_date)==10, date:=as.Date(delivery_date, format='%d/%m/%Y')]
sem[, table(date)]
sem[, table(date)][sem[, table(date)]!=48] # what days are incomplete?
# 1/1/09 and 1/1/11 are ok (beginning and end of data)
# 3/29/09 3/28/09, 10/25/09, 10/31/10 are ok (daylight savings)
# not sure not sure why we're missing data on 8/1/09, 6/24/10-6/27/10.
(365+366)*48
sem[, load:=as.numeric(gsub(',','',system_load))]
sem[, table(delivery_hour)]
sem[, delivery_hour:= delivery_hour-1 ] #1->0, 24->23
sem[, half.hr:= delivery_hour*2 + delivery_interval]
sem = sem[between(date, '2009-01-01', '2010-12-31'), .(date, half.hr, load, smp, shadow_price)]

# Convert SMP to euro cents per kwh:
# (EUR/MWH * (1 MWH/1000 KWH) * (100 CENTS/1 EUR) = EUR/MWH * 100/1000 = EUR/MWH * 1/10)
sem[, smp.kwh := smp/10]
sem[, shadow.price.kwh := shadow_price/10]
sem[, t:=year(date)+(month(date)-1)/12+(day(date)-1)/(365.25)+half.hr/(365.25*48)]

dt.reg[, sum(n_residents)] # 8200 people (6700 adults)

# Pct
100*dt.reg[, sum(n_residents)]/(4.56*1e6) # 0.18% (1 in 560) of Irish population in experiment.
100*dt.reg[any.trt==1, .N]/(1.674290*1e6) # 0.14% (1 in 719) of Irish households treated
scaling.fac = 1/(dt.reg[any.trt==1, .N]/(1.674290*1e6))
100*dt.reg[any.trt==1, sum(n_residents)]/(4.560154*1e6) # 0.14% (710) of Irish population treated
1/0.00139; 1/0.001409
# number of households in Ireland = 1.674 mln in 2010: http://w3.unece.org/PXWeb2015/pxweb/en/STAT/STAT__30-GE__02-Families_households/08_en_GEFHPrivHouse_r.px/table/tableViewLayout1/?rxid=f6e0f1ea-e82e-4bf2-a08c-44fb47295f48
# (1.689 in 2015)
# population of ireland = 4.560 mln in 2010: http://w3.unece.org/PXWeb2015/pxweb/en/STAT/STAT__30-GE__01-Pop/001_en_GEPOAGESEX_REG_r.px/table/tableViewLayout1/?rxid=1a8613ae-c4f8-46a4-9afd-ce17384a2846
# (4.677 in 2015)

## Find half-hourly load to compute revenue-neutral price uplift needed ----
# This is the ratio of load to
# fixed_price*sum(load) = uplift*sum(wholesale.price*load)
# => uplift = fixed_price*sum(load) /  sum(wholesale.price*load)
load(file=output_dir%&%'/consumption_panel.Rdata')
load(file=output_dir%&%'/consumption_panel_nonweekday.Rdata')
dt.temp = rbindlist(list(dt[date>='2010-01-01', .(id,date, half.hr, kwh)], 
                         dt.nonweekday[date>='2010-01-01', .(id,date,half.hr, kwh)]))
dt.temp = dt.temp[, .(kwh=sum(kwh)), by=.(date, half.hr)]
dt.temp = dt.temp[order(date, half.hr)]
rm(dt); rm(dt.nonweekday)
gc()
dt.temp = merge(dt.temp, sem, by=c('date', 'half.hr'))

# Check load
# According to https://en.wikipedia.org/wiki/Energy_in_Ireland
# 2009 saw 26.9 TWh of electricity
# 26.9 TWh = 26.9 billion kwh
26.9*1e12/1e3/1e9 # blns of kwh
dt.temp[, sum(load)/2/1e6] # load/2 since it is load (in MW) measured every half-hour.
# This gets 35 billion kwh, close enough

lambda.sv.load = dt.temp[, 14.1*sum(load)/sum(shadow.price.kwh*load)]
lambda.sv.sample = dt.temp[, 14.1*sum(kwh)/sum(shadow.price.kwh*kwh)]
lambda.smp.load = dt.temp[, 14.1*sum(load)/sum(smp.kwh*load)]
lambda.smp.sample = dt.temp[, 14.1*sum(kwh)/sum(smp.kwh*kwh)]

dt.temp[, 14.1/100*sum(load/2*1000)]/1e6 # millions of euros
dt.temp[, lambda.sv.load*sum(shadow.price.kwh/100*load/2*1000)]/1e6

dt.temp[, 14.1/100*sum(kwh)]/1e6 # millions of euros 
dt.temp[, lambda.sv.sample*sum(shadow.price.kwh/100*kwh)]/1e6

# Use sample uplift/lambda (not overall load lambda) to be consistent with data. 
sem[, rtp.sv := shadow.price.kwh * lambda.sv.sample]
sem[, rtp.smp := smp.kwh * lambda.smp.sample]
sem

# left off here
### iii) Find consumption under real time pricing ----
# Choosing January 4, 2010, since that was a high load/high spot price day
load(file=output_dir%&%'/consumption_panel.Rdata')
dt.all = dt[date=='2010-01-04', .(id, period, rate, tar_stim, date, half.hr, kwh, trt.period)]
rm(dt); gc()
dt.all = dt.all[tar_stim!='EE']
dt.all = merge(dt.all, leaf.locs, by='id', all=FALSE) # this weeds out the id's not in the surveys
dt.all[, c('trt.period', 'tar_stim'):=NULL]

dt.all = merge(dt.all, 
               sem[date>='2010-01-01',.(date, half.hr, smp.kwh, shadow.price.kwh, rtp.sv, rtp.smp)],
               by=c('date','half.hr'))
dt.all = dt.all[order(id, date, half.hr)]
cor(dt.all[, .(smp.kwh, shadow.price.kwh)])

dt.all

# Create vectorized functions to find nearest matches (nearest.idx.vec, for numerics) 
# and exact matches (which.vec, for characters)
round.to.nearest = function(x, lvl) (round(x/lvl)*lvl)

dt.all[, range(rate)]
dt.all[, range(smp.kwh)]
range(price.grid)

dt.all[, rate.round := as.character(round.to.nearest(rate, price.reso))]
dt.all[, sv.round := as.character(round.to.nearest(shadow.price.kwh, price.reso))] # sv is actual sv (in cents/kwh)

ind.rate = dt.all[, array( c(rate.round, leaf.num, period), dim=c(.N, 3))]; gc()
ind.sv = dt.all[, array( c(sv.round, leaf.num, period), dim=c(.N, 3))]; gc()
ind.flat = dt.all[, array( c(rep('14', .N), leaf.num, period), dim=c(.N, 3))]; gc()

# Gross change under 5 different price cases (TOU rate (actual), smp, shadow value, scaled smp, flat rate)
dt.all[, d.kwh.pct.rate := d.kwh.pct_hat[ind.rate]]; gc()
dt.all[, d.kwh.pct.sv := d.kwh.pct_hat[ind.sv]]; gc()
dt.all[, d.kwh.pct.flat := d.kwh.pct_hat[ind.flat]]; gc()

rm(list=c('ind.rate','ind.sv','ind.flat')); gc()

# Net change relative to actual (rate is zero change from actual TOU)
dt.all[, d.kwh.pct.sv := d.kwh.pct.sv - d.kwh.pct.rate]
dt.all[, d.kwh.pct.flat := d.kwh.pct.flat - d.kwh.pct.rate]

# Consumption (kwh) under alternative pricing
dt.all[, kwh.sv := kwh*(1+d.kwh.pct.sv)]
dt.all[, kwh.flat := kwh*(1+d.kwh.pct.flat)]
dt.all[, c('d.kwh.pct.rate', 'd.kwh.pct.flat', 'rate.round') := NULL]

dt.sim = dt.all[, .(kwh=sum(kwh), kwh.flat=sum(kwh.flat), kwh.sv=sum(kwh.sv)), 
                by=.(date, half.hr)]

dt.sim = merge(dt.sim, sem, by=c('date','half.hr'))

# Population stats
N_hh = dt.all[, uniqueN(id)]
ppl.per.hh = dt.reg[any.trt==1, sum(n_residents)/.N]
N = N_hh*ppl.per.hh
Ire.Pop = 4735395  # in 2016. http://www.worldometers.info/world-population/ireland-population/

# par(mfrow=c(2,1))
# Graph Consumption and real time prices (Figure A.13) 
if (write.out==TRUE) pdf(file=graph_dir%&%'/RT-Simulation-Example-NoUplift-'%&%today()%&%'.pdf', family='serif', width=10, height=7)
par(mfrow=c(1,1), family='serif', mai=c(0.85,1,0.4,1)) # for pdf output
par(cex.lab=1.785)
my.cols = c('green','red','blue')
# Plot consumption by scenario
plot(kwh.sv ~ I(half.hr/2), dt.sim, type='l', col='red', lwd=2, ylim=c(0,2000), cex.lab=1.75, cex.axis=1.75,
     main='', xlab='Hour of Day', xaxt='n', ylab='Consumption (kWh)')
axis(1, at=seq(0,24,by=2), cex.axis=1.75)
lines(kwh ~  I(half.hr/2), dt.sim, col='green', lwd=2, lty=1, type='l')
lines(kwh.flat ~  I(half.hr/2), dt.sim, col='blue', lwd=2, lty=1, type='l')
abline(v=range(peak.periods), lty=3) #; abline(v=0)
legend(x=0, y=2000, legend=c('kWh, Flat 14.1 cents/kWh (left)','kWh, TOU Pricing (left)','kWh, Real Time Pricing (left)'),
       lwd=2, lty=c(1,1,1), bty='n', col=c('blue','green','red'), cex=1.5)

par(new=TRUE)
ygrid=c(0,60)

# Plot prices
plot(shadow.price.kwh ~  I(half.hr/2), dt.sim, type='l', ylim=ygrid, col='gray20',
     lwd=3, ylab='', xlab='', yaxt='n', xaxt='n', cex.lab=1.75, lty=5)
abline(v=range(peak.periods)/2, lty=3)
abline(h=14.1, lty=3, lwd=2, col='black')
axis(4, at = seq(min(ygrid),max(ygrid),by=10), cex.axis=1.75)
mtext('cents/kWh', side = 4, line = 3, cex = par("cex.lab"))
legend(x= 0, y=44, legend='14.1 cents/kWh Fixed Tariff (right)', col='gray20', lty=3, lwd=2, bty='n', cex=1.5)
legend(x= 0, y=40, legend='Marginal Price (right)', col='gray20', lty=5, lwd=2, bty='n', cex=1.5)
dev.off()

#### 4) Calculate response by tariff group (Table A.2) ----
load(file=output_dir%&%'/consumption_panel.Rdata')
dt = dt[, .(id, period, half.hr, date, tar_stim, kwh, trt.period)]
date.week = data.table(date=dt[, unique(date)])
date.week[, wk:=year(date)%&%'-'%&%week(date)]
dt = merge(dt, date.week, by='date')
dt[, tar:=substr(tar_stim, 1, 1)]
dt[, tar:=relevel(factor(tar), ref='E')]
dt; dt[, uniqueN(id)]

gc()
fe.rate.day.all = felm(log(kwh+0.0001) ~ tar*trt.period | id+wk | 0 | id+wk, data=dt[period=='Day']); gc()
fe.rate.night.all = felm(log(kwh+0.0001) ~ tar*trt.period | id+wk | 0 | id+wk, data=dt[period=='Night']); gc()
fe.rate.peak.all = felm(log(kwh+0.0001) ~ tar*trt.period | id+wk | 0 | id+wk, data=dt[period=='Peak'])
rm(dt)
gc()

summary(fe.rate.day.all)
summary(fe.rate.night.all)
summary(fe.rate.peak.all)

# Output Table A.2
stargazer(list(fe.rate.peak.all, fe.rate.day.all, fe.rate.night.all), out=output_dir%&%'Tables/TE-by-Tariff_'%&%today()%&%'.tex')

# Run F tests
R = matrix(NA, nrow=3, ncol=4)
R[1,] = c(1,-1,0,0)
R[2,] = c(0,1,-1,0)
R[3,] = c(0,0,1,-1)

# Tests for all responses equal
waldtest(fe.rate.peak.all, R=cbind(matrix(0,nrow=3, ncol=5),R), r=rep(0,3), type='cluster') # p=0.08
waldtest(fe.rate.day.all, R=cbind(matrix(0,nrow=3, ncol=5),R), r=rep(0,3), type='cluster') # p=0.37
waldtest(fe.rate.night.all, R=cbind(matrix(0,nrow=3, ncol=5),R), r=rep(0,3), type='cluster') # p=0.12

# Test for individual differences
# A vs B. p=0.26
waldtest(fe.rate.peak.all, R=cbind(matrix(0,nrow=1, ncol=5),matrix(c(1,-1,0,0), nrow=1, ncol=4)), r=0, type='cluster') 
# B vs C. p=0.72
waldtest(fe.rate.peak.all, R=cbind(matrix(0,nrow=1, ncol=5),matrix(c(0,1,-1,0), nrow=1, ncol=4)), r=0, type='cluster') 
# C vs D. p=0.08
waldtest(fe.rate.peak.all, R=cbind(matrix(0,nrow=1, ncol=5),matrix(c(0,0,1,-1), nrow=1, ncol=4)), r=0, type='cluster') 
# A vs D. p=0.01
waldtest(fe.rate.peak.all, R=cbind(matrix(0,nrow=1, ncol=5),matrix(c(1,0,0,-1), nrow=1, ncol=4)), r=0, type='cluster') 

# How big of an effect would the 26 cent tier have to be to be statistically detectible?
B = fe.rate.peak.all$coefficients[6:9]
V = vcov(fe.rate.peak.all)[6:9, 6:9]

constant.elast.effect = B[1]*(26-14.1)/(20-14.1)
constant.elast.effect # if constant elasticity, need -14% effect at tariff B
B[2]-1.96*sqrt(V[2,2]) # -11% is lower bound of 95% CI. So we can reject -14%

d.SE = sqrt(V[1,1]+V[2,2]-2*V[1,2]) # SE of B1-B2
# can detect coefficient +2 SEs
detectible.effect = B[1]-1.96*d.SE # effect must be larger than this for it to be detectible
detectible.effect # point estimate would have to be -10% or greater to detect
detectible.effect/B[1] # can detect a 40% higher effect from a 100% price increase.
B[2]/B[1] # actual effect is only 25% larger.

# ran code to here // left off here
#### 5) Honest Tree ----
ids.all = dt.reg.dupe[, unique(id)]
set.seed(5) # very balanced groups (-8.87% and -8.90%) (also 2, 5)
train.idx = sample(1:length(ids.all), size=length(ids.all)/2, replace=FALSE)
train.ids = ids.all[train.idx]
est.ids = ids.all[-train.idx]

# Subset data
train.data = dt.reg.dupe[id %in% train.ids]
est.data = dt.reg.dupe[id %in% est.ids]
dim(train.data); dim(est.data)

# Construct treatment vectors for training and estimation data
train.trt = train.data[, any.trt]
est.trt = est.data[, any.trt]
length(train.trt); length(est.trt)

# Construct CV group assignment vectors for training data
train.cv.grps = train.data[, cv.grps]
est.cv.grps = est.data[, cv.grps]

# Check
train.data[any.trt==1, mean(d.kwh.Peak)]-train.data[any.trt==0, mean(d.kwh.Peak)]
est.data[any.trt==1, mean(d.kwh.Peak)]-est.data[any.trt==0, mean(d.kwh.Peak)]

train.data[train.trt==1, mean(d.kwh.Peak)]-train.data[train.trt==0, mean(d.kwh.Peak)]
est.data[est.trt==1, mean(d.kwh.Peak)]-est.data[est.trt==0, mean(d.kwh.Peak)]

# Remove extra variables not to be used in the trees
train.data.temp = train.data[, -c('id', 'any.trt', 'cv.grps', 'prop.score',
                                  'rate.Peak_trt.period', 'rate.Day_trt.period', 'rate.Night_trt.period'), with=FALSE]
est.data.temp = est.data[, -c('id', 'any.trt', 'cv.grps', 'prop.score',
                              'rate.Peak_trt.period', 'rate.Day_trt.period', 'rate.Night_trt.period'), with=FALSE]
dim(train.data); dim(est.data)

# Run honest trees
c.tree.hon.peak = honest.causalTree(d.kwh.Peak ~ . - d.kwh.Day - kwh.Day_base - d.kwh.Night - kwh.Night_base,
                                    data=train.data.temp, treatment=train.trt, est_data=est.data.temp, est_treatment=est.trt,
                                    split.Rule='TOT', cv.option='TOT', xval=train.cv.grps, minsize=0.02*dt.reg[, .N]) #, 
c.tree.hon.night = honest.causalTree(d.kwh.Night ~ . - d.kwh.Day - kwh.Day_base - d.kwh.Peak - kwh.Peak_base,
                                     data=train.data.temp, treatment=train.trt, est_data=est.data.temp, est_treatment=est.trt,
                                     split.Rule='TOT', cv.option='TOT', xval=train.cv.grps, minsize=0.02*dt.reg[, .N])
c.tree.hon.day = honest.causalTree(d.kwh.Day ~ . - d.kwh.Night - kwh.Night_base - d.kwh.Peak - kwh.Peak_base,
                                   data=train.data.temp, treatment=train.trt, est_data=est.data.temp, est_treatment=est.trt,
                                   split.Rule='TOT', cv.option='TOT', xval=train.cv.grps, minsize=0.02*dt.reg[, .N]) 
c.tree.hon.peak = replace.proportions(c.tree.hon.peak, data=est.data)
c.tree.hon.night = replace.proportions(c.tree.hon.night, data=est.data)
c.tree.hon.day = replace.proportions(c.tree.hon.day, data=est.data)

par(mfrow=c(1,3), mai=c(1,1,0.5,0.5))
plot.pruned.tree(c.tree.hon.peak, data=train.data, digits=3, nsplits=NULL, main='Peak') 
plot.pruned.tree(c.tree.hon.night, data=train.data, digits=3, nsplits=NULL, main='Night') 
plot.pruned.tree(c.tree.hon.day, data=train.data, digits=3, nsplits=NULL, main='Day') 

plotcp(c.tree.hon.peak)
plotcp(c.tree.hon.night)
plotcp(c.tree.hon.day)

# Plot full data tree vs honest tree
par(mfrow=c(1,2), mai=c(1,1,0.5,0.5))
plot.pruned.tree(c.tree.peak, data=dt.reg.dupe, nsplits=NULL, digits=3, main='Treatment Effect (Peak Hours) - Full Data')
plot.pruned.tree(c.tree.hon.peak, data=train.data, nsplits=NULL, digits=3, main='Treatment Effect (%, Peak Hours) - Honest')

plot.pruned.tree(c.tree.night, data=dt.reg.dupe, nsplits=NULL, digits=3, main='Treatment Effect (Night Hours) - Full Data')
plot.pruned.tree(c.tree.hon.night, data=train.data, nsplits=NULL, digits=3, main='Treatment Effect (%, Night Hours) - Honest')

c.tree.hon.peak.prune = opt.prune(c.tree.hon.peak)

# Estimate SEs of Tree
tree.regs.peak.hon = tree.SEs(tree=c.tree.hon.peak.prune, data=est.data, yvar='d.kwh.Peak')
c.tree.hon.peak.prune$frame = tree.regs.peak.hon[[1]]$frame

tree = copy(c.tree.hon.peak.prune)

default.labs.hon = rpart.plot:::internal.split.labs(tree, type=2, clip.left.labs = FALSE, clip.right.labs = FALSE, 
                                                    xflip=FALSE, digits=2, varlen= 25, faclen=3, facsep=",", 
                                                    eq=" = ", lt = " < ", ge = " >= ", split.prefix = "",
                                                    right.split.prefix = NULL, split.suffix = "", right.split.suffix = NULL)
default.labs.hon

# Plot honest tree
split.labs.hon <- function(x, labs, digits, varlen, faclen) {
  labs[grep('tariff_change >= 0.5', labs)] = 'Aware of Tariff Change?'
  labs[grep('kwh.Peak_base >= 0.22', labs)] = 'Baseline Average Peak \n Consumption >= 0.22 kWh (18th pctile)'
  labs[grep('stim.bm.bill.ihd >= 0.5', labs)] = 'Info. Treatment: \n In-Home Display?' 
  labs
}
split.labs.hon(tree, default.labs.hon, digits=2, varlen=-8, faclen=3)

if (write.out==TRUE) pdf(file=graph_dir%&%'/Peak-Tree-Honest-'%&%today()%&%'.pdf', family='serif', width=10, height=7)
par(mfrow=c(1,1), family='serif', mai=c(0.85,0.85,0.4,0.15)) # for pdf output
prp.defaults(tree, split.fun=split.labs.hon, cex=1.15, split.cex=0.875, tweak=0.95)
dev.off()

est.data[, .N, by=any.trt]
dt.reg[, quantile(kwh.Peak_base, probs=(1:10)/10)]
dt.reg[, mean(kwh.Peak_base<=0.21552)]
dt.reg[any.trt==1, mean(kwh.Peak_base<=0.21552)]
est.data[any.trt==1, mean(kwh.Peak_base<=0.21552)]
dt.reg[any.trt==1, mean(kwh.Peak_base>0.253)]

#### 6) Differential effects by temperature and electric devices -----
# Read in temp
ire.temp = fread(data_dir%&%'/Weather/phoenix_park_temps.csv')
ire.temp[, date:=as.Date(date)]
# one missing obs:
ire.temp[date=='2010-11-24' & hour==13, ]
temp.interp = sapply(ire.temp[date=='2010-11-24' & hour %in% c(12,14),.(wetbulbtemp, temp)], mean)
ire.temp[date=='2010-11-24' & hour==13, wetbulbtemp:=temp.interp[1] ]
ire.temp[date=='2010-11-24' & hour==13, temp:=temp.interp[2] ]
ire.temp[,.N]; ire.temp[complete.cases(ire.temp), .N]

cor(ire.temp[, .(wetbulbtemp, temp)]) # almost perfect correlation
sapply(ire.temp, class)
# Note: weather data is in UTC, electricity data is local time. add 1 hour
ire.temp[, hour:=hour+1]
ire.temp[, range(hour)]
# temperature data is an instantaneous reading. link it to subsequent hour in dt
# so if half hour is 47 or 48, link it to hour 23 (which is the reading at 11pm)
ire.temp

## Set up data and merge to temperatures
load(output_dir%&%'/consumption_panel.Rdata')
dt.fe.reg = copy(dt[period=='Peak'])
rm(dt); gc()
dt.fe.reg[, hour:=ceiling(half.hr/2)-1]
# this -1 shifts the hour back 1, which is the relevant time for the temperature data
# e.g, if half hour is 47 or 48, link it to hour 23 (which is the reading at 11pm)
# problem: there is now an hour 0 for half hours 1 and 2. This should correspond to the previous day's
# 24th hour. So we need to change the merge date to match the previous day.
dt.fe.reg[date=='2009-07-15' & id=='1003']
dt.fe.reg[, date.merge:=date]
dt.fe.reg[hour==0, date.merge:=date-1]
dt.fe.reg[hour==0, hour:=24]

dt.fe.reg[,.N]
dt.fe.reg = merge(dt.fe.reg, ire.temp[, .(date, hour, wetbulbtemp, temp)], 
                  by.x=c('date.merge','hour'), by.y=c('date','hour'), all.x=TRUE, all.y=FALSE)
dt.fe.reg[,.N]
dt.fe.reg = dt.fe.reg[order(id, date, half.hr)]
dt.fe.reg[, date.merge:=NULL]
dt.fe.reg[, hour:=NULL]
gc()
# done merging temperature
dt.fe.reg = merge(dt.fe.reg, 
                  survey[, .(id, d_aware_of_tariff_change, d_heat_elec_central, 
                             d_heat_elec_plugin, d_waterheater_elec_immersion, 
                             d_waterheater_elec_instant, d_immersion, d_elec_cook,
                             d_washing_machine, d_dishwasher, d_tumble_dryer)], by='id')
dt.fe.reg[, uniqueN(id)] # checking the number of people
# Make "any elec heat" variable
dt.fe.reg[, d_heat_elec_any := 0]
dt.fe.reg[d_heat_elec_plugin==1,  d_heat_elec_any := 1]
dt.fe.reg[d_heat_elec_central==1, d_heat_elec_any := 1]

# Make "any elec water heat" variable
dt.fe.reg[, d_waterheat_elec_any := 0]
dt.fe.reg[d_waterheater_elec_immersion==1,  d_waterheat_elec_any := 1]
dt.fe.reg[d_waterheater_elec_instant==1, d_waterheat_elec_any := 1]
dt.fe.reg[d_immersion==1, d_waterheat_elec_any := 1]
dt.fe.reg[, .N, by=.(d_immersion, d_waterheater_elec_immersion, d_waterheater_elec_instant)]

gc()

## Now create temperature bins
temp.breaks = seq(-7, 20, by=3)
dt.fe.reg[, wetbulb.bin := cut(wetbulbtemp, breaks=temp.breaks)]
wetbulb.bin.levels = dt.fe.reg[, levels(wetbulb.bin)]
dt.fe.reg[, wetbulb.bin:=relevel(wetbulb.bin, '(-1,2]')] # all results will be relative to -1 to 2 C bin.
base.bin = '(-1,2]'

## plot histogram of peak temperatures
# keep only 1 household (temperature measure is the same across them)
# keep only odd half hours (since even half hours are just repeated hourly data)
# drop the 5 hours with temperatures below -4 degrees
# (we drop them below bc not enough variation to estimate)
x = copy(dt.fe.reg[id==1002][half.hr %in% c(35,37)]) 
x = x[, .(date, half.hr, wetbulbtemp, wetbulb.bin)]
gc()
hist(x[,wetbulbtemp])
x = x[,.N, by=wetbulb.bin]
x = x[.N:1] # reverse order for plot (coldest to hottest)
x = x[-1] # drop coldest bin with insufficient variation
x[, N.share := N/sum(N)]
x[, sum(N.share)]
x = x[c(1:5,7,8,6)] # manually reorder into ascending order

if (write.out) pdf(graph_dir%&%'TEs_Temp_Bins_'%&%today()%&%'.pdf', family='serif', width=10, height=7)
par(mfrow=c(1,1), family='serif', mai=c(0.85,0.95,0.4,0.15))
barplot(x[,N.share], names.arg=x[,wetbulb.bin], ylim=c(0,0.25), cex.axis=1.5, cex.lab=1.5, cex.names=1.5,
        ylab='Share of Peak Hours', xlab='Temperature Bin (Degrees Celsius)')
grid(col='gray70', ny=NULL, nx=NA)
abline(h=0)
dev.off()

fe.fit.wetbulb.all = felm(log(kwh+0.0001) ~ any.trt*trt.period*wetbulb.bin
                          | id+mon | 0 | id+mon, data=dt.fe.reg) #[sample(.N, .N/1000)])
summary.wetbulb = summary(fe.fit.wetbulb.all) 
vcov.wetbulb = vcov(fe.fit.wetbulb.all)
fe.fit.wetbulb = clear.fe(fe.fit.wetbulb.all)
# treatment effect is no different during cold hours (any.trt:trt.period:cold.bin effectively zero and insignif p=0.93)

# FE Regressions interacting temperature with appliance data
fe.fit.wetbulb = fe.sum.wetbulb = fe.vcov.wetbulb = fe.TEs.wetbulb =
  array(list(), dim=c(length(wetbulb.bin.levels), 1+6*2), 
        dimnames=list(wetbulb.bin.levels, 
                      c('All','Heat','NoHeat','WHeat','NoWHeat','Cook','NoCook',
                        'Dryer','NoDryer','CWasher','NoCWasher','DWasher','NoDWasher')))

wetbulb.bin.levels
dt.fe.reg[, wk:=year(date)%&%'-'%&%week(date)]
dt.fe.reg
for (i in wetbulb.bin.levels) {
  dt.temp = dt.fe.reg[wetbulb.bin==i]
  if (dt.temp[,uniqueN(trt.period)==1]) next # skip if no control period obs. (or if no trt period obs)
  fe.fit.wetbulb[i,1][[1]] = felm(log(kwh+0.0001) ~ any.trt:trt.period | id+wk | 0 | id+wk, data=dt.temp)
  fe.fit.wetbulb[i,2][[1]] = felm(log(kwh+0.0001) ~ any.trt:trt.period | id+wk | 0 | id+wk, data=dt.temp[d_heat_elec_any==1])
  fe.fit.wetbulb[i,3][[1]] = felm(log(kwh+0.0001) ~ any.trt:trt.period | id+wk | 0 | id+wk, data=dt.temp[d_heat_elec_any==0])
  fe.fit.wetbulb[i,4][[1]] = felm(log(kwh+0.0001) ~ any.trt:trt.period | id+wk | 0 | id+wk, data=dt.temp[d_waterheat_elec_any==1])
  fe.fit.wetbulb[i,5][[1]] = felm(log(kwh+0.0001) ~ any.trt:trt.period | id+wk | 0 | id+wk, data=dt.temp[d_waterheat_elec_any==0])
  fe.fit.wetbulb[i,6][[1]] = felm(log(kwh+0.0001) ~ any.trt:trt.period | id+wk | 0 | id+wk, data=dt.temp[d_elec_cook==1])
  fe.fit.wetbulb[i,7][[1]] = felm(log(kwh+0.0001) ~ any.trt:trt.period | id+wk | 0 | id+wk, data=dt.temp[d_elec_cook==0])
  fe.fit.wetbulb[i,8][[1]] = felm(log(kwh+0.0001) ~ any.trt:trt.period | id+wk | 0 | id+wk, data=dt.temp[d_tumble_dryer==1])
  fe.fit.wetbulb[i,9][[1]] = felm(log(kwh+0.0001) ~ any.trt:trt.period | id+wk | 0 | id+wk, data=dt.temp[d_tumble_dryer==0])
  fe.fit.wetbulb[i,10][[1]] = felm(log(kwh+0.0001) ~ any.trt:trt.period | id+wk | 0 | id+wk, data=dt.temp[d_washing_machine==1])
  fe.fit.wetbulb[i,11][[1]] = felm(log(kwh+0.0001) ~ any.trt:trt.period | id+wk | 0 | id+wk, data=dt.temp[d_washing_machine==0])
  fe.fit.wetbulb[i,12][[1]] = felm(log(kwh+0.0001) ~ any.trt:trt.period | id+wk | 0 | id+wk, data=dt.temp[d_dishwasher==1])
  fe.fit.wetbulb[i,13][[1]] = felm(log(kwh+0.0001) ~ any.trt:trt.period | id+wk | 0 | id+wk, data=dt.temp[d_dishwasher==0])
  
  for (j in 1:dim(fe.fit.wetbulb)[2]) {
    fe.sum.wetbulb[i,j][[1]] = summary(fe.fit.wetbulb[i,j][[1]])
    fe.vcov.wetbulb[i,j][[1]] = vcov(fe.fit.wetbulb[i,j][[1]])
    fe.fit.wetbulb[i,j][[1]] = clear.fe(fe.fit.wetbulb[i,j][[1]])
    fe.TEs.wetbulb[i,j][[1]] = fe.sum.wetbulb[i,j][[1]]$coefficients
  }
  message(paste(i,'done'))
  gc()
}

# Convert results to matrices
fe.TEs.wetbulb.mat = array(NA, dim=c(length(wetbulb.bin.levels)-1, 4, length(dimnames(fe.fit.wetbulb)[[2]])),
                           dimnames=list(wetbulb.bin.levels[-1], 
                                         colnames(fe.TEs.wetbulb[i,j][[1]]),
                                         dimnames(fe.fit.wetbulb)[[2]]))
for (j in 1:dim(fe.fit.wetbulb)[2]) {
  fe.TEs.wetbulb.mat[,,j] = matrix(unlist(fe.TEs.wetbulb[,j]), byrow=TRUE, nrow=length(wetbulb.bin.levels)-1)
}

dimnames(fe.fit.wetbulb)

wetbulb.bin.levels.plot = dimnames(fe.TEs.wetbulb.mat)[1]
bins = length(wetbulb.bin.levels.plot[[1]])

# Create nice plotting functions
plot.CI = function(mat, col, j, plot.line=TRUE, plot.se=TRUE, ...) {
  if (plot.line) lines(mat[,'Estimate',j], type='b', lwd=3, col=col, ...)
  if (plot.se) lines(mat[,'Estimate',j]-1.96*fe.TEs.wetbulb.mat[,'Cluster s.e.',j], type='l', lwd=2, col=col, lty=3)
  if (plot.se) lines(mat[,'Estimate',j]+1.96*fe.TEs.wetbulb.mat[,'Cluster s.e.',j], type='l', lwd=2, col=col, lty=3)
}

plot.CI.bars = function(mat, col, j, plot.line=TRUE, plot.se=TRUE, shft=0, ...) {
  if (plot.line) points(y=mat[,'Estimate',j], x=1:bins+shft, lwd=3, col=col, cex=1.4, ...)
  if (plot.se) {
    segments(1:bins+shft, y0=mat[,'Estimate',j]-1.96*fe.TEs.wetbulb.mat[,'Cluster s.e.',j],
             y1=mat[,'Estimate',j]+1.96*fe.TEs.wetbulb.mat[,'Cluster s.e.',j], 
             lwd=2, col=col, lty=1, cex=1.2)
    arrows(1:bins+shft, y0=mat[,'Estimate',j]-1.96*fe.TEs.wetbulb.mat[,'Cluster s.e.',j],
           y1=mat[,'Estimate',j]+1.96*fe.TEs.wetbulb.mat[,'Cluster s.e.',j], 
           lwd=2, col=col, lty=1, angle=90, code=3, length=0.05, cex=1.2)
  }
}

# Points w/ bars
# All
if (write.out) pdf(graph_dir%&%'TEs_Temp_Binned_All_'%&%today()%&%'.pdf', family='serif', width=10, height=7)
par(mfrow=c(1,1), family='serif', mai=c(0.85,0.95,0.4,0.15))
plot(y=fe.TEs.wetbulb.mat[,'Estimate','All'], x=1:bins, type='p', ylim=c(-0.4,0.1), xlim=c(0.5,bins+0.5), xaxt='n', lwd=3,
     ylab=expression(paste('Treatment Effect (Log Points: ', Delta,'ln(',Y[i],'))')),
     xlab='Temperature Bin (Degrees Celsius)', col='black', 
     cex=1.4, cex.lab=1.5, cex.axis=1.5)
grid(col='gray70'); abline(h=0)
axis(1, at=1:bins, labels=wetbulb.bin.levels[-1], cex.axis=1.5)
plot.CI.bars(fe.TEs.wetbulb.mat, j='All', col='black', plot.line=FALSE, plot.se=TRUE, shft=0)
dev.off()

# Home heat
if (write.out) pdf(graph_dir%&%'TEs_Temp_Binned_Heat_'%&%today()%&%'.pdf', family='serif', width=10, height=7)
par(mfrow=c(1,1), family='serif', mai=c(0.85,0.95,0.4,0.15))
plot(y=fe.TEs.wetbulb.mat[,'Estimate','Heat'], x=1:bins - 0.1, type='p', ylim=c(-0.4,0.1), xlim=c(0.5,bins+0.5), xaxt='n', lwd=3,
     ylab=expression(paste('Treatment Effect (Log Points: ', Delta,'ln(',Y[i],'))')),
     xlab='Temperature Bin (Degrees Celsius)', col='red', 
     cex=1.4, cex.lab=1.5, cex.axis=1.5)
grid(col='gray70'); abline(h=0)
axis(1, at=1:bins, labels=wetbulb.bin.levels[-1], cex.axis=1.5)
plot.CI.bars(fe.TEs.wetbulb.mat, j='Heat', col='red', plot.line=FALSE, plot.se=TRUE, shft=-0.1)
plot.CI.bars(fe.TEs.wetbulb.mat, j='NoHeat', col='blue', plot.line=TRUE, plot.se=TRUE, shft=0.1)
legend('topright', legend=c('Electric Home Heat','No Electric Home Heat'), 
       col=c('red','blue'), pch=c(1,1), lty=1:1, bty='n', cex=1.5, lwd=2)
dev.off()

# Water heat
if (write.out) pdf(graph_dir%&%'TEs_Temp_Binned_WaterHeat_'%&%today()%&%'.pdf', family='serif', width=10, height=7)
par(mfrow=c(1,1), family='serif', mai=c(0.85,0.95,0.4,0.15))
plot(y=fe.TEs.wetbulb.mat[,'Estimate','WHeat'], x=1:bins - 0.1, type='p', ylim=c(-0.4,0.1), xlim=c(0.5,bins+0.5), xaxt='n', lwd=3,
     ylab=expression(paste('Treatment Effect (Log Points: ', Delta,'ln(',Y[i],'))')),
     xlab='Temperature Bin (Degrees Celsius)', col='red', 
     cex=1.4, cex.lab=1.5, cex.axis=1.5)
grid(col='gray70'); abline(h=0)
axis(1, at=1:bins, labels=wetbulb.bin.levels[-1], cex.axis=1.5)
plot.CI.bars(fe.TEs.wetbulb.mat, j='WHeat', col='red', plot.line=FALSE, plot.se=TRUE, shft=-0.1)
plot.CI.bars(fe.TEs.wetbulb.mat, j='NoWHeat', col='blue', plot.line=TRUE, plot.se=TRUE, shft=0.1)
legend('topright', legend=c('Electric Water Heat','No Electric Water Heat'), 
       col=c('red','blue'), pch=c(1,1), lty=1:1, bty='n', cex=1.5, lwd=2)
dev.off()

# Elec Cook
if (write.out) pdf(graph_dir%&%'TEs_Temp_Binned_Cook_'%&%today()%&%'.pdf', family='serif', width=10, height=7)
par(mfrow=c(1,1), family='serif', mai=c(0.85,0.95,0.4,0.15))
plot(y=fe.TEs.wetbulb.mat[,'Estimate','Cook'], x=1:bins - 0.1, type='p', ylim=c(-0.4,0.1), xlim=c(0.5,bins+0.5), xaxt='n', lwd=3,
     ylab=expression(paste('Treatment Effect (Log Points: ', Delta,'ln(',Y[i],'))')),
     xlab='Temperature Bin (Degrees Celsius)', col='red', 
     cex=1.4, cex.lab=1.5, cex.axis=1.5)
grid(col='gray70'); abline(h=0)
axis(1, at=1:bins, labels=wetbulb.bin.levels[-1], cex.axis=1.5)
plot.CI.bars(fe.TEs.wetbulb.mat, j='Cook', col='red', plot.line=FALSE, plot.se=TRUE, shft=-0.1)
plot.CI.bars(fe.TEs.wetbulb.mat, j='NoCook', col='blue', plot.line=TRUE, plot.se=TRUE, shft=0.1)
legend('topright', legend=c('Electric Cook Stove','No Electric Cook Stove'), 
       col=c('red','blue'), pch=c(1,1), lty=1:1, bty='n', cex=1.5, lwd=2)
dev.off()

# Tumbler Dryer
if (write.out) pdf(graph_dir%&%'TEs_Temp_Binned_Dryer_'%&%today()%&%'.pdf', family='serif', width=10, height=7)
par(mfrow=c(1,1), family='serif', mai=c(0.85,0.95,0.4,0.15))
plot(y=fe.TEs.wetbulb.mat[,'Estimate','Dryer'], x=1:bins - 0.1, type='p', ylim=c(-0.4,0.1), xlim=c(0.5,bins+0.5), xaxt='n', lwd=3,
     ylab=expression(paste('Treatment Effect (Log Points: ', Delta,'ln(',Y[i],'))')),
     xlab='Temperature Bin (Degrees Celsius)', col='red', 
     cex=1.4, cex.lab=1.5, cex.axis=1.5)
grid(col='gray70'); abline(h=0)
axis(1, at=1:bins, labels=wetbulb.bin.levels[-1], cex.axis=1.5)
plot.CI.bars(fe.TEs.wetbulb.mat, j='Dryer', col='red', plot.line=FALSE, plot.se=TRUE, shft=-0.1)
plot.CI.bars(fe.TEs.wetbulb.mat, j='NoDryer', col='blue', plot.line=TRUE, plot.se=TRUE, shft=0.1)
legend('topright', legend=c('Tumble Dryer','No Tumble Dryer'), 
       col=c('red','blue'), pch=c(1,1), lty=1:1, bty='n', cex=1.5, lwd=2)
dev.off()

# Clothes Washer (note: standard errors on "no washing machine" are too huge to be informative)
if (write.out) pdf(graph_dir%&%'TEs_Temp_Binned_CWasher_'%&%today()%&%'.pdf', family='serif', width=10, height=7)
par(mfrow=c(1,1), family='serif', mai=c(0.85,0.95,0.4,0.15))
plot(y=fe.TEs.wetbulb.mat[,'Estimate','CWasher'], x=1:bins - 0.1, type='p', ylim=c(-0.4,0.1), xlim=c(0.5,bins+0.5), xaxt='n', lwd=3,
     ylab=expression(paste('Treatment Effect (Log Points: ', Delta,'ln(',Y[i],'))')),
     xlab='Temperature Bin (Degrees Celsius)', col='red', 
     cex=1.4, cex.lab=1.5, cex.axis=1.5)
grid(col='gray70'); abline(h=0)
axis(1, at=1:bins, labels=wetbulb.bin.levels[-1], cex.axis=1.5)
plot.CI.bars(fe.TEs.wetbulb.mat, j='CWasher', col='red', plot.line=FALSE, plot.se=TRUE, shft=-0.1)
plot.CI.bars(fe.TEs.wetbulb.mat, j='NoCWasher', col='blue', plot.line=TRUE, plot.se=TRUE, shft=0.1)
legend('topright', legend=c('Washing Machine','No Washing Machine'), 
       col=c('red','blue'), pch=c(1,1), lty=1:1, bty='n', cex=1.5, lwd=2)
dev.off()

# Dish Washer
if (write.out) pdf(graph_dir%&%'TEs_Temp_Binned_DWasher_'%&%today()%&%'.pdf', family='serif', width=10, height=7)
par(mfrow=c(1,1), family='serif', mai=c(0.85,0.95,0.4,0.15))
plot(y=fe.TEs.wetbulb.mat[,'Estimate','DWasher'], x=1:bins - 0.1, type='p', ylim=c(-0.4,0.1), xlim=c(0.5,bins+0.5), xaxt='n', lwd=3,
     ylab=expression(paste('Treatment Effect (Log Points: ', Delta,'ln(',Y[i],'))')),
     xlab='Temperature Bin (Degrees Celsius)', col='red', 
     cex=1.4, cex.lab=1.5, cex.axis=1.5)
grid(col='gray70'); abline(h=0)
axis(1, at=1:bins, labels=wetbulb.bin.levels[-1], cex.axis=1.5)
plot.CI.bars(fe.TEs.wetbulb.mat, j='DWasher', col='red', plot.line=FALSE, plot.se=TRUE, shft=-0.1)
plot.CI.bars(fe.TEs.wetbulb.mat, j='NoDWasher', col='blue', plot.line=TRUE, plot.se=TRUE, shft=0.1)
legend('topright', legend=c('Dishwasher','No Dishwasher'), 
       col=c('red','blue'), pch=c(1,1), lty=1:1, bty='n', cex=1.5, lwd=2)
dev.off()

#### 7)  Peak effect by week of treatment ----
# Load back in consumption panel, dt (dropped above for memory)
load(output_dir%&%'/consumption_panel.Rdata')
dt = dt[period=='Peak']
dt[, wk:=as.character(week(date))] # for interaction
dt[nchar(wk)==1, wk := paste0('0',wk)] # left pad single digit weeks so they sort correctly
dt[, wk_of_sample:=paste(year(date),wk)] # for FEs/clustering

# Interact treatment with week of sample
fe.fit.wkly = felm(log(kwh+0.0001) ~ any.trt:trt.period:wk_of_sample |
                       id+wk_of_sample | 0 | id, data=dt)
rm(dt)
fe.sum.wkly = summary(fe.fit.wkly)
fe.sum.wkly
fe.coef.wkly = fe.sum.wkly$coefficients
fe.coef.wkly = fe.coef.wkly[which(!is.na(fe.coef.wkly[,1])),]

# Plot treatment effect by week of year, along with 95% CIs
if (write.out==TRUE) pdf(file=graph_dir%&%'/Treatment-Effects-by-Week-'%&%today()%&%'.pdf', family='serif', width=10, height=7)
par(mai=c(0.775,0.9,0.05,0.15), family='serif') # for pdf output
yrange = c(-0.2, 0.2)
ygrid = round(seq(yrange[1],yrange[2],by=0.05),2)

plot(fe.coef.wkly[,1], type='l', lwd=2, col='black', ylim=yrange, xaxt='n', yaxt='n', 
     xlab='Week of Year', ylab=expression(paste('Treatment Effect (Log Points: ', Delta,'ln(',Y[i],'))')), cex.lab=1.5)
axis(1, at=seq(0,52,by=4), cex.axis=1.35)
axis(2, at=ygrid, cex.axis=1.35); 
abline(v=seq(0,52,by=4), h=ygrid, col='gray70', lty=3)
abline(h=0, lty=1, col='gray20')
lines(fe.coef.wkly[,1]+1.96*fe.coef.wkly[,2], type='l', col='black', lty=3, lwd=2)
lines(fe.coef.wkly[,1]-1.96*fe.coef.wkly[,2], type='l', col='black', lty=3, lwd=2)
legend('topright', legend=c('Estimated Treatment Effect', '95% Confidence Interval'), lty=c(1,3), lwd=2, cex=1.2, bty='n')
dev.off()

#### 8) Are low income households more price sensitive? ----
# How many refused to answer?
dt.income = copy(dt.reg)
dt.income[, table(f_hhincome, exclude=NULL)]
dt.income[, sum(is.na(f_hhincome1))]/dt.income[,.N] # 54% refused to answer
dt.income[, mean(is.na(f_hhincome1)), by=any.trt] # similar numbers refused in treatment/control
summary(lm(any.trt ~ is.na(f_hhincome1), data=dt.income)) # no signif diff

dt.income[, f_hhincome:=as.character(f_hhincome)]
dt.income[is.na(f_hhincome), f_hhincome:='Refused']

# Correlation of income and consumption
# Indeed, income is positively correlated with baseline consumption
summary(lm(kwh.Peak_base ~ -1 + relevel(factor(f_hhincome),'1'), data=dt.income))
summary(lm(kwh.Peak_base ~ relevel(factor(f_hhincome),'3'), data=dt.income))

# Check balance by income
summary(lm(any.trt ~ -1 + factor(f_hhincome), data=dt.income))
summary(lm(any.trt ~ relevel(factor(f_hhincome),'3'), data=dt.income))

# Check impact on treatment effect
# Choose reference level to be most frequent one, which is "refused to answer"
dt.income[,table(f_hhincome)]
summary(lm(d.kwh.Peak ~ any.trt*relevel(factor(f_hhincome),'Refused'), data=dt.income))
summary(lm(d.kwh.Peak ~ any.trt:relevel(factor(f_hhincome),'1'), data=dt.income))
summary(lm(d.kwh.Peak ~ any.trt*relevel(factor(f_hhincome),'1'), data=dt.income))

# Check price sensitivity for low vs high income
lm.inc.tar = lm(d.kwh.Peak ~ relevel(factor(f_hhincome),'1'):(tariff.A+tariff.B+tariff.C+tariff.D), data=dt.income)
round(matrix(cov2cor(vcov(lm.inc.tar)), nrow=25),1)
inc.tar = summary(lm.inc.tar)$coefficients

rownames(inc.tar) = gsub('\"','', rownames(inc.tar))
rownames(inc.tar) = gsub('relevel\\(factor\\(f_hhincome\\), 1\\)','', rownames(inc.tar))
rownames(inc.tar)
inc.tar = inc.tar[-1,] # drop intercept
inc.tar # reorder to put groups together
inc.tar = inc.tar[sort(rownames(inc.tar)),]

inc.tar = data.table(inc.tar,keep.rownames=T)
inc.tar[, c('inc','tar') := tstrsplit(rn, ':')]
inc.tar[, CI.lwr := Estimate - 1.96*`Std. Error`]
inc.tar[, CI.upr := Estimate + 1.96*`Std. Error`]

inc.tar[inc=='1', inc.lab := '<€15k']
inc.tar[inc=='2', inc.lab := '€15k - €30k']
inc.tar[inc=='3', inc.lab := '€30k - €50k']
inc.tar[inc=='4', inc.lab := '€50k - €75k']
inc.tar[inc=='5', inc.lab := '>€75k']
inc.tar[inc=='Refused', inc.lab := 'Refused']

inc.tar[, tar:=gsub('tariff\\.', '', tar)]

inc.tar[, inc.lab := factor(inc.lab, 
                            levels=c('<€15k','€15k - €30k','€30k - €50k',
                                     '€50k - €75k','>€75k','Refused'))]
dt.income[, .N/dt.income[,.N], by=f_hhincome]

my.cols = rainbow(4, start=0.5, end=1)

pdf(graph_dir%&%'/Treatment_Effect_by_tariff_and_income_'%&%today()%&%'.pdf', family='serif', width=10, height=7)
par(mai=c(0.9,0.9,0.05,0.15), family='serif') # for pdf output
xlocs = 0 + 1:24 + floor((0:23)/4) # every 4 points there's a gap
plot(y=inc.tar[,Estimate], x=xlocs, col=my.cols[inc.tar[, as.factor(tar)]], ylim=c(-0.25,0.1),
     xlab='Income Group', xaxt='n', pch=19, cex=1.5, cex.lab=1.5, cex.axis=1.5,
     ylab=expression(paste('Treatment Effect (Log Points: ', Delta,'ln(',Y[i],'))')))
abline(h=0)
abline(v=seq(5,25,by=5), col='gray30')
grid(col='gray70', nx=NA, ny=NULL)
par(xpd=TRUE)
text(x=2.5+seq(0,29,by=5), y=-0.28, labels=inc.tar[,unique(inc.lab)], cex=1.25)
par(xpd=FALSE)
legend('topright', legend=c('Tariff '%&%toupper(letters[1:4])),
       fill=my.cols, horiz=F, bg='white', cex=1.45)
segments(x0=xlocs, y0=inc.tar[,CI.lwr], y1=inc.tar[,CI.upr], col=my.cols)
arrows(x0=xlocs+0.01, y0=inc.tar[,CI.lwr], y1=inc.tar[,CI.upr], angle=90, lwd=2, lty=1, length=0.05, code=3,
       col=my.cols)
# abline(h=-0.089)
dev.off()

#### 9) Effects on Bills/CS (Counterfactual) ----
# Predict TEs. Find counterfactual consumption by period. Compute new bill and surplus
# Zero out the statistically insigificant treatment effects
zero.out.insignif.leaves = function(tree) {
  tree.signif = tree
  insignif = which(tree.signif$frame[, 'pval']>0.05)
  tree.signif$frame[insignif, 'yval'] = 0
  return(tree.signif)
}

c.tree.peak.signif = zero.out.insignif.leaves(c.tree.peak)
c.tree.night.signif = zero.out.insignif.leaves(c.tree.night)
c.tree.day.signif = zero.out.insignif.leaves(c.tree.day)

zero.out.insignif = FALSE
if (zero.out.insignif) {
  dt.TE = data.table(id=dt.reg[, id], TE.peak=predict(c.tree.peak.signif, newdata=dt.reg),
                     TE.night=predict(c.tree.night.signif, newdata=dt.reg), 
                     TE.day=predict(c.tree.day.signif, newdata=dt.reg))
} else {
  dt.TE = data.table(id=dt.reg[, id], TE.peak=predict(c.tree.peak, newdata=dt.reg),
                     TE.night=predict(c.tree.night, newdata=dt.reg), 
                     TE.day=predict(c.tree.day, newdata=dt.reg))
}

# Merge to dt (but only for ones which we actually have survey data)  
# Also, I can only do this for the treatment group, since I don't know what treatment the 
# control group would have received. I also do not know how "aware" any individual control 
# household would have been.
load(output_dir%&%'/consumption_panel.Rdata')
dt.TE = merge( dt, dt.TE, by='id')
rm(dt); gc()
dt.TE[period=='Peak', TE:=TE.peak]
dt.TE[period=='Night', TE:=TE.night]
dt.TE[period=='Day', TE:=TE.day]
dt.TE[, `:=`(TE.peak=NULL, TE.night=NULL, TE.day=NULL)] 

# Compute half-hourly costs under the counterfactual, according to
# kwh_trt = kwh_ctrl * (1+TE), implying,
# kwh_ctrl = kwh_trt / (1+TE)
# The counterfactual in the baseline period is treated
# The counterfactual in the treatment period is control
dt.TE[trt.period==0, kwh.cf := kwh*(1+TE)]
dt.TE[trt.period==0, elec.cost := kwh*14.1/100] # in baseline period, flat rate (actual)
dt.TE[trt.period==0, elec.cost.cf.ze := kwh*rate/100] # in baseline period, varying rate (counter-factual)
dt.TE[trt.period==0, elec.cost.cf.pr := kwh.cf*rate/100] # in baseline period, varying rate (counter-factual)

dt.TE[trt.period==1, kwh.cf := kwh/(1+TE)]
dt.TE[trt.period==1, elec.cost := kwh*rate/100] # in treatment period, varying rate (actual)
dt.TE[trt.period==1, elec.cost.cf.ze := kwh*14.1/100] # in treatment period, flat rate (counter-factual)
dt.TE[trt.period==1, elec.cost.cf.pr := kwh.cf*14.1/100] # in treatment period, flat rate (counter-factual)

# Compute hourly heterogeneous DWL. Note that these are written where (-) means a loss, i.e., DWL
# Note that the few TEs that of the wrong sign positive (so price up => quantity up) will suggest benefits
dt.TE[trt.period==0, DWL.tou.vs.flat := 0] # zero DWL since no price change (delta price=0)
dt.TE[trt.period==0, CS.tou.vs.flat := 0] # zero CS change since no price change
dt.TE[trt.period==1, DWL.tou.vs.flat := (rate-14.1)/100*(kwh-kwh.cf)/2] # observed is treated (under rate). cf is counterfactual flat pricing (triangle)
dt.TE[trt.period==1 & rate<14.1 & kwh>kwh.cf, DWL.tou.vs.flat := abs((rate-14.1)/100*(kwh-kwh.cf)/2)] # when rate is lower and consumption rises, the CS should be positive
dt.TE[trt.period==1, CS.tou.vs.flat := DWL.tou.vs.flat - (rate-14.1)/100*kwh] # higher price on all consumed units (rectangle)

# A few id's are missing 1 or 2 days of observations, so sum(cost)!=mean(cost)*(periods in year)
# For these, just take the mean and scale up by the number of periods in the year
max.obs.base = dt.TE[trt.period==0, .N, by=.(id, trt.period)][, max(N)]
max.obs.trt = dt.TE[trt.period==1, .N, by=.(id, trt.period)][, max(N)]

# ~12,000 half-hours. we don't have ~17,500 = 48*365 because we dropped weekends and holidays
# i.e., max.obs.trt/(48*365) is approximately 5/7 (a bit less because there's also holidays)
max.obs.base + max.obs.trt
dt.TE[, uniqueN(date)]*48 # = 17856

dt.TE.sum = dt.TE[, .(kwh.actual = mean(kwh), kwh.cf = mean(kwh.cf), elec.cost.actual = mean(elec.cost), 
                      elec.cost.cf.ze = mean(elec.cost.cf.ze), elec.cost.cf.pr = mean(elec.cost.cf.pr),
                      DWL.tou.vs.flat = mean(DWL.tou.vs.flat), CS.tou.vs.flat = mean(CS.tou.vs.flat)),
                  by=.(id, trt.period, tar_stim)]
rm(dt.TE)
gc()
dt.TE.sum
baseline.months = 5.5 # (mid July to end of December: 12, 11, 10, 9, 8, + half of 7)
trt.months = 12

# Scale up to get monthly averages, instead of half-hourly averages
dt.TE.sum[trt.period==0, kwh.actual := kwh.actual*max.obs.base/baseline.months]
dt.TE.sum[trt.period==0, kwh.cf := kwh.cf*max.obs.base/baseline.months]
dt.TE.sum[trt.period==0, elec.cost.actual := elec.cost.actual*max.obs.base/baseline.months]
dt.TE.sum[trt.period==0, elec.cost.cf.ze := elec.cost.cf.ze*max.obs.base/baseline.months]
dt.TE.sum[trt.period==0, elec.cost.cf.pr := elec.cost.cf.pr*max.obs.base/baseline.months]
dt.TE.sum[trt.period==0, DWL.tou.vs.flat := DWL.tou.vs.flat*max.obs.base/baseline.months] # doesn't do anything. always 0
dt.TE.sum[trt.period==0, CS.tou.vs.flat := CS.tou.vs.flat*max.obs.base/baseline.months] # doesn't do anything. always 0

dt.TE.sum[trt.period==1, kwh.actual := kwh.actual*max.obs.trt/trt.months]
dt.TE.sum[trt.period==1, kwh.cf := kwh.cf*max.obs.trt/trt.months]
dt.TE.sum[trt.period==1, elec.cost.actual := elec.cost.actual*max.obs.trt/trt.months]
dt.TE.sum[trt.period==1, elec.cost.cf.ze := elec.cost.cf.ze*max.obs.trt/trt.months]
dt.TE.sum[trt.period==1, elec.cost.cf.pr := elec.cost.cf.pr*max.obs.trt/trt.months]
dt.TE.sum[trt.period==1, DWL.tou.vs.flat := DWL.tou.vs.flat*max.obs.trt/trt.months]
dt.TE.sum[trt.period==1, CS.tou.vs.flat := CS.tou.vs.flat*max.obs.trt/trt.months]

# For differential bill/welfare effects
dt.TE.sum = merge(dt.TE.sum, dt.reg[, .(id, d_aware_of_tariff_change)], by='id')

dt.TE.sum[trt.period==1, bill.change.tou.vs.flat := elec.cost.actual - elec.cost.cf.pr]
dt.TE.sum[trt.period==1, bill.change.tou.vs.flat.ze := elec.cost.actual - elec.cost.cf.ze]

# Average annual bill effects (in euro), for Table A.3:
# Average
sapply(dt.TE.sum[trt.period==1 & tar_stim!='EE', 
                 .(LossNoTrans = -DWL.tou.vs.flat, 
                   LossWithTrans = -CS.tou.vs.flat,
                   BillChange = bill.change.tou.vs.flat)],
       mean)*12

# BM Bill Group
sapply(dt.TE.sum[trt.period==1  & as.character(tar_stim) %in% c('A1','B1','C1','D1'), 
                 .(LossNoTrans = -DWL.tou.vs.flat, 
                   LossWithTrans = -CS.tou.vs.flat,
                   BillChange = bill.change.tou.vs.flat)],
       mean)*12

# IHD Group
sapply(dt.TE.sum[trt.period==1  & as.character(tar_stim) %in% c('A3','B3','C3','D3'), 
                 .(LossNoTrans = -DWL.tou.vs.flat, 
                   LossWithTrans = -CS.tou.vs.flat,
                   BillChange = bill.change.tou.vs.flat)],
       mean)*12


## Plot Distributions of Bill savings 
if (write.out==TRUE) pdf(file=graph_dir%&%'/Bill-Change-Distribution-BMBill-vs-IHD-'%&%today()%&%'.pdf', family='serif', width=10, height=7)
par(mfrow=c(1,1), family='serif', mai=c(0.85,0.85,0.4,0.15)) # for pdf output
xgrid = seq(-50,100, by=10)
dt.TE.sum[trt.period==1 & as.character(tar_stim) %in% c('A1','B1','C1','D1'), 
          plot(density( bill.change.tou.vs.flat*12, adj=1.5), 
               col='red', lwd=2, xlab='Euro (€)', ylim=c(0,0.05), xlim=range(xgrid), xaxt='n',
               main='', cex.lab=1.5, cex.axis=1.5)]
# main='Estimated Changes in Total Annual Bills (€) from TOU Pricing')]
axis(1, xgrid, cex.axis=1.5)
abline(v=0, lty=1, lwd=2); grid(col='gray70', nx=NA, ny=NULL); abline(v=xgrid, col='gray70',lty=3)
dt.TE.sum[trt.period==1 & as.character(tar_stim) %in% c('A3','B3','C3','D3'), 
          lines(density(bill.change.tou.vs.flat*12, adj=1.5), col='green', lwd=2)]

mean.bill.pr.bmbill = dt.TE.sum[trt.period==1 & as.character(tar_stim) %in% c('A1','B1','C1','D1'), mean(bill.change.tou.vs.flat*12)]
mean.bill.pr.ihd = dt.TE.sum[trt.period==1 & as.character(tar_stim) %in% c('A3','B3','C3','D3'), mean(bill.change.tou.vs.flat*12)]
c(mean.bill.pr.bmbill, mean.bill.pr.ihd)

abline(v=mean.bill.pr.bmbill, lty=2, col='red', lwd=2)
abline(v=mean.bill.pr.ihd, lty=2, col='green', lwd=2)
legend(x=15, y=0.04, legend=c('Bi-monthly Bill Group (Avg = €'%&%round(mean.bill.pr.bmbill,2)%&%')',
                              'IHD Group (Avg = €'%&%round(mean.bill.pr.ihd,2)%&%')'), 
       bty='n', lty=1, col=c('red','green'), lwd=2, cex=1.5)
dev.off()

## 10) Estimating avoided capital costs ----
# Capital costs:
# Brattle $1.012, $0.948, $0.903, $0.971, $0.931
# EIA: $0.973
# EIA Fixed OM: $7.34/kW-yr
# Brattle recommended discount rate
cap.cost.per.watt = mean( c(mean(c(1.012, 0.948, 0.903, 0.971, 0.931)), 0.973*1.03^(2018-2012)) )
npv.fom.cost = (7.34*1.03^(2018-2012))/1000*(1-(1+.08)^-20)/0.08 # assuming a 20 year economic life, from Brattle
cost.per.watt = cap.cost.per.watt + npv.fom.cost

# Convert to euro
eurusd = fread(data_dir%&%'/Misc/eur_usd.csv')
eurusd[, Date:=as.Date(Date)]
fx.2010 = eurusd[between(Date, '2010-01-01', '2010-12-31'), mean(EURUSD_Curncy)]

cost.per.watt = cost.per.watt / fx.2010 # convert to euro

# When does peak load occur, and how big is it?
sem[, t:=year(date)+(month(date)-1)/12+(day(date)-1)/(365.25)+half.hr/(365.25*48)]
plot(load ~ t, data=sem, type='l')
# When was peak load during the 2009-10 winter season?
sem[between(date,'2009-06-01','2010-06-01')][load %in% sem[between(date,'2009-06-01','2010-06-01')][, max(load), by=year(date)][, V1],]
# January 7, 2010
sem.peak.load = sem[date=='2010-01-07' & half.hr==36, load]
weekdays(as.Date('2010-01-07')) # a Thursday, when TOU would be ineffect

## Grab LATE under each scenario
lm.fit.by.stim = lm(d.kwh.Peak ~ stim.bm.bill.ihd+stim.bm.bill.olr+stim.mon.bill+stim.bm.bill, data=dt.reg)
TE.stim = lm.fit.by.stim$coefficients[-1]

LATE = c()
peak.mean.kwh = c()

LATE[1] = c.tree.peak$frame[1,'yval'] # average effect

# Find baseline average consumption on this date
load(file=output_dir%&%'/consumption_panel.Rdata')
peak.mean.kwh = dt[half.hr==36][date=='2010-01-07'][tar_stim!='EE', mean(kwh)]

LATE[2] = TE.stim['stim.bm.bill'] # BM bill
peak.mean.kwh[2] = dt[half.hr==36][date=='2010-01-07'][grep('1$', tar_stim), mean(kwh)] # just BM bill

LATE[3] = TE.stim['stim.bm.bill.ihd'] # BM bill
peak.mean.kwh[3] = dt[half.hr==36][date=='2010-01-07'][grep('3$', tar_stim), mean(kwh)] # just IHD bill
rm(dt)

peak.mean.load = peak.mean.kwh*2 # average treated peak load = 1.16 kW
peak.mean.load
peak.mean.load/(1+LATE) # untreated peak load
# trtd load = untrted*(1+LATE) = untrted + untrtd*LATE
# reduction = trted - untrted = trted  - trted/(1+LATE) = trted*(1+LATE-1)/(1+LATE)
#           = trted*LATE/(1+LATE)
# It's like scaling up treated to get untreated (dividing by (1+LATE) then computing the reduction (*LATE))
peak.mean.load*LATE/(1+LATE) # average reduction in peak load = -0.323 kw
peak.mean.load - peak.mean.load/(1+LATE) # same thing: observed vs counterfactual

# Calculate benefits from reduced required capacity (Table A.4)
avoided.cost.per.cap = -cost.per.watt*peak.mean.load/(1+LATE)*1000*LATE # per-household value of reduced peak load
avoided.cost.per.cap # savings PER HOUSEHOLD in Euro

# in 2010, average consumption was 4410.8 kWh in this sample, according to page 64 of http://www.seai.ie/Publications/Statistics_Publications/Energy_in_Ireland/Energy-in-Ireland-1990-2015.pdf
typical.irish.cons.2015 = 4470 #
typical.irish.load.2015 = typical.irish.cons.2015/(365*24)
# Irish load is ~0.5 kW

# US load is ~1.1 kW per household.
# typical.us.load.2016 = 1407394/123.229/(366*24) # mln kwh / 123 mln hhs / (366*24) (2016 numbers so leap year)
typical.us.load.2015 = 1404096/124.587/(365*24) # mln kwh / 124 mln hhs / (365*24) 
# load from: https://www.eia.gov/electricity/data/browser/#/topic/5?agg=0,1&geo=g&endsec=vg&linechart=ELEC.SALES.US-TRA.A~ELEC.SALES.US-RES.A~ELEC.SALES.US-COM.A~ELEC.SALES.US-IND.A~ELEC.SALES.US-OTH.A&columnchart=ELEC.SALES.US-ALL.A~ELEC.SALES.US-RES.A~ELEC.SALES.US-COM.A~ELEC.SALES.US-IND.A&map=ELEC.SALES.US-ALL.A&freq=A&start=2010&end=2016&ctype=linechart&ltype=pin&rtype=s&pin=&rse=0&maptype=0
# #households from https://www.census.gov/data/tables/time-series/demo/families/households.html
# Compare US and Ire loads
typical.irish.load.2015/typical.us.load.2015-1
typical.us.load.2015/typical.irish.load.2015

# Extrapolate to US benefits per household (Table A.4)
us.avoided.cost.per.cap = typical.us.load.2015/typical.irish.load.2015*avoided.cost.per.cap*fx.2010
us.avoided.cost.per.cap # in dollars

# https://www.eia.gov/electricity/data/browser/#/topic/5?agg=0,1&geo=g&endsec=vg&linechart=ELEC.SALES.US-TRA.A~ELEC.SALES.US-RES.A~ELEC.SALES.US-COM.A~ELEC.SALES.US-IND.A~ELEC.SALES.US-OTH.A&columnchart=ELEC.SALES.US-ALL.A~ELEC.SALES.US-RES.A~ELEC.SALES.US-COM.A~ELEC.SALES.US-IND.A&map=ELEC.SALES.US-ALL.A&freq=A&start=2010&end=2016&ctype=linechart&ltype=pin&rtype=s&pin=&rse=0&maptype=0
# benefit.usa/.379 # https://www.eia.gov/electricity/data/browser/#/topic/5?agg=0,1&geo=vvvvvvvvvvvvo&endsec=vg&linechart=ELEC.SALES.US-ALL.A&columnchart=ELEC.SALES.US-ALL.A&map=ELEC.SALES.US-ALL.A&freq=A&ctype=linechart&ltype=pin&maptype=0&rse=0&pin=
# Alternatively: that % reduction in needed US capacity
# North America 2017 summer capacity = 1,063,021 MW
# cost.per.watt*1063021*LATE*0.379 # $/W * MW = $millions
cost.per.watt*1063021*-0.01 # $/W * MW = $millions
# Source: Table 4.1 of http://www.nerc.com/pa/RAPA/ra/Reliability%20Assessments%20DL/2016%20Long-Term%20Reliability%20Assessment.pdf

## 11) Check various side statistics ----
# decreasing = # #Share with electric heating?
survey[, ifelse(d_heat_elec_plugin==1 | d_heat_elec_central==1, 1, 0)]
grep('heat', names(dt.reg), v=T)
dt.reg[, table(d_heat_elec_central, d_heat_elec_plugin, exclude=NULL)]

sh.table = function(x, ndigit=2) round(table(x, exclude=NULL)/x[,.N], ndigit)*100

sh.table(dt.reg[, .(d_heat_elec_central, d_heat_elec_plugin)])
# Only 7% have some form of electric HOME heating (not counting water heating)

sh.table(dt.reg[, .(d_heat_oil, d_heat_gas)])
# People primarily use oil (59%) and gas (30%) for waterheat

sh.table(dt.reg[, .(d_waterheater_elec_immersion, d_waterheater_elec_instant)])
# Meanwhile, about 56% have some form of electric WATER heat (primarily immersion)

sh.table(dt.reg[, .(d_waterheater_gas, d_waterheater_oil)])
# Plus, people often use oil (40%) and gas (23%) for waterheat

## Overall reduction in consumption across all treated days (avg reduction in consump.)
load(file=output_dir%&%'/consumption_panel.Rdata')
dt.cons = dt
# dt.cons[, any.trt:=ifelse(tar_stim=='EE',0,1)]
dt.cons
dt.cons.sum = dt.cons[, .(kwh=mean(kwh)), by=.(id, any.trt, trt.period)]
rm(dt)
rm(dt.cons)

# fe.fit.cons = felm(log(kwh+0.0001) ~ any.trt:trt.period + trt.period | id+mon | 0 | id+mon, data=dt.cons)
fe.fit.cons.sum = felm(log(kwh+0.0001) ~ any.trt:trt.period + trt.period | id | 0 | id, data=dt.cons.sum)
summary(fe.fit.cons.sum) # 2.3% reduction in average consumption

## How many variables are used in the tree? 
x = copy(dt.reg)
x = x[, names(which(check.nas(x)==x[,.N])), with=FALSE]
dim(x) # 168 complete variables
names(x) # some of these are "Y" variables, or excluded from the X (e.g., cv.grps)
x = x[, -c('id', 'any.trt','d.kwh.Peak','d.kwh.Night','d.kwh.Day','cv.grps', 'prop.score'), with=FALSE]
dim(x) # this leaves 161 variables

## Check (per reviewer comment): Treatment effect interacted with a statistic reflecing
# the percentage of appliances that could be adjusted.
names(dt.reg)
# Easily adjustable appliances
# n_washing_machine+n_tumble_dryer+n_dishwasher+n_immersion+
#   n_elec_shower_instant+n_elec_shower_pumped+n_elec_cook+n_elec_heater_appliance_plugin+
#   n_standalone_freezer+n_water_pump
dt.adj = copy(dt.reg)
dt.adj[, sh_appl_adj := (n_washing_machine + n_tumble_dryer + n_dishwasher + n_immersion)/n_appliances]
dt.adj[n_appliances==0, sh_appl_adj := 0]

# Test if treatment group variaes on adjustable appliances
dt.adj[, mean(sh_appl_adj)]
dt.adj[, mean(sh_appl_adj), by=any.trt]
summary(lm(sh_appl_adj ~ any.trt, data=dt.adj)) # no difference

# Histogram
dt.adj[, hist(sh_appl_adj, main='', xlab='Share Easily Adjustable', family='serif', ylim=c(0,1200), col='lightblue')]
grid(col='gray70')

# Test if easily adjustable appliances affect the responsiveness
summary(lm(d.kwh.Peak ~ any.trt*sh_appl_adj, data=dt.adj)) 
# no difference (p=0.69)

# Check individual devices
summary(lm(d.kwh.Peak ~ any.trt*d_heater_timer, data=dt.adj)) 
summary(lm(d.kwh.Peak ~ any.trt*d_waterheater_timer, data=dt.adj)) 
# Check interaction with price level
summary(lm(d.kwh.Peak ~ -1 + any.trt*sh_appl_adj*(tariff.A + tariff.B + tariff.C + tariff.D), data=dt.adj)) 
summary(lm(d.kwh.Peak ~ 1 + sh_appl_adj*(I(tariff.A==TRUE) + I(tariff.B==TRUE) + I(tariff.C==TRUE) + I(tariff.D==TRUE)), data=dt.adj)) 
